Introduction
In the last post, I acquired the review data for one product on Amazon, converted it into a corpus, and built a word cloud. In this post I scrape reviews for several more products and preprocess the data.
Loading the libraries
Code
library(polite)
library(rvest)
library(tidyverse)
library(stringr)
library(quanteda)
library(tidyr)
library(RColorBrewer)
library(quanteda.textplots)
library(wordcloud)
library(wordcloud2)
library(devtools)
library(quanteda.dictionaries)
library(quanteda.sentiment)
Code
knitr::opts_chunk$set(echo = TRUE)
Code
scrape_amazon <- function(ASIN, page_num) {
  url_reviews <- paste0("https://www.amazon.com/product-reviews/", ASIN, "/?pageNumber=", page_num)
  doc <- read_html(url_reviews) # Assign results to `doc`
  # Review title
  doc %>%
    html_nodes("[class='a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold']") %>%
    html_text() -> review_title
  # Review text
  doc %>%
    html_nodes("[class='a-size-base review-text review-text-content']") %>%
    html_text() -> review_text
  # Number of stars in review
  doc %>%
    html_nodes("[data-hook='review-star-rating']") %>%
    html_text() -> review_star
  # Return a tibble
  tibble(review_title, review_text, review_star, page = page_num, ASIN)
}
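To build a larger dataset, `scrape_amazon` is called once per results page for each ASIN, pausing between batches of requests so the scraper stays polite. The helper below is a sketch of that pattern; `collect_pages` is my own name for it, not a function from this post, and `fetch` stands in for a wrapper around `scrape_amazon` for one ASIN.

```r
library(dplyr)

# Call `fetch(i)` for pages 1..n_pages, pausing after every third
# request, then stack the per-page tibbles into one data frame.
collect_pages <- function(fetch, n_pages, pause = 2) {
  pages <- vector("list", n_pages)
  for (i in seq_len(n_pages)) {
    pages[[i]] <- fetch(i)
    if (i %% 3 == 0) Sys.sleep(pause) # take a short break between batches
  }
  bind_rows(pages)
}

# e.g. collect_pages(function(i) scrape_amazon("B0001DBI1Q", i), 10)
```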
Code
reviews <- read_csv("amazonreview.csv")
New names:
• `` -> `...1`
Rows: 46450 Columns: 6
── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr (4): review_title, review_text, review_star, ASIN
dbl (2): ...1, page
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
reviews
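The column-specification message can be silenced either with `show_col_types = FALSE` or by passing an explicit `col_types` spec. A minimal sketch, using a made-up inline CSV (via `I()`) rather than the real scraped file:

```r
library(readr)

# An explicit column spec silences the parsing message; I() lets
# read_csv() treat the string as literal CSV data.
demo <- read_csv(I("review_star,page\n5.0 out of 5 stars,1\n4.0 out of 5 stars,2\n"),
                 col_types = cols(review_star = col_character(),
                                  page = col_double()))
demo
```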
Code
clean_text <- function(text) {
  # Remove URLs
  str_remove_all(text, " ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)") %>%
    # Remove mentions
    str_remove_all("@[[:alnum:]_]*") %>%
    # Replace "&" character reference with "and"
    str_replace_all("&", "and") %>%
    # Remove punctuation, using a standard character class
    str_remove_all("[[:punct:]]") %>%
    # Remove digits
    str_remove_all("[[:digit:]]") %>%
    # Replace any newline characters with a space
    str_replace_all("\\\n|\\\r", " ") %>%
    # Remove strings like "<U+0001F9F5>"
    str_remove_all("<.*?>") %>%
    # Make everything lowercase
    str_to_lower() %>%
    # Remove leading/trailing white space and repeated spaces inside a string
    str_squish()
}
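Walking a made-up review string through the same steps shows what the pipeline does: mentions, punctuation, and digits drop out, "&" becomes "and", and the result is lower-cased and squished.

```r
library(stringr)

# Apply the same transformations as clean_text(), step by step
x <- "Loved it!!! 5 stars & more @user"
x <- str_remove_all(x, "@[[:alnum:]_]*") # drop mentions
x <- str_replace_all(x, "&", "and")      # & -> and
x <- str_remove_all(x, "[[:punct:]]")    # drop punctuation
x <- str_remove_all(x, "[[:digit:]]")    # drop digits
x <- str_to_lower(x)
str_squish(x)
#> [1] "loved it stars and more"
```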
Code
reviews$clean_text <- clean_text(reviews$review_text)
reviews <- reviews %>% drop_na(clean_text)
reviews

Code
text <- corpus(c(reviews$clean_text))
text <- dfm(tokens(text, remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_select(pattern = stopwords("en"), selection = "remove"))
text

Code
text_dfm <- dfm_trim(text, min_termfreq = 50, docfreq_type = "prop")
# create fcm from dfm
text_fcm <- fcm(text_dfm)
text_fcm

Feature co-occurrence matrix of: 3,795 by 3,795 features.
features
features love fantasy ever since kid stories set creative worlds groups
love 13025 2234 3670 2395 337 1701 1155 224 161 72
fantasy 0 1779 884 584 45 554 456 76 99 20
ever 0 0 722 758 90 408 347 64 55 17
since 0 0 0 455 60 271 309 36 28 15
kid 0 0 0 0 48 23 27 6 5 5
stories 0 0 0 0 0 313 190 28 33 33
set 0 0 0 0 0 0 237 26 29 11
creative 0 0 0 0 0 0 0 22 3 2
worlds 0 0 0 0 0 0 0 0 8 4
groups 0 0 0 0 0 0 0 0 0 2
[ reached max_feat ... 3,785 more features, reached max_nfeat ... 3,785 more features ]
Code
# pull the top features
top_features <- names(topfeatures(text_fcm, 50))
# retain only those top features as part of our matrix
even_text_fcm <- fcm_select(text_fcm, pattern = top_features, selection = "keep")
# check dimensions
dim(even_text_fcm)
[1] 50 50
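The same two calls can be checked on a toy co-occurrence matrix; the three short "documents" below are invented for illustration.

```r
library(quanteda)

# Build a small fcm, take the 2 most frequent features,
# and keep only those rows/columns of the matrix.
toy <- fcm(tokens(c("good book good story",
                    "great book good plot",
                    "great story")))
top2 <- names(topfeatures(toy, 2))
fcm_select(toy, pattern = top2, selection = "keep")
```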
Code
# compute size weight for vertices in network
size <- log(colSums(even_text_fcm))
# create plot
textplot_network(even_text_fcm, vertex_size = size / max(size) * 2)
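The log-then-normalize weighting compresses the huge frequency range so rare words stay visible: the most frequent feature gets vertex size 2 and everything else scales down slowly. Reproducing it on three counts taken from the co-occurrence table above:

```r
# Diagonal counts for "love", "fantasy", and "groups" from the fcm output
counts <- c(love = 13025, fantasy = 1779, groups = 2)
size <- log(counts)
# "love" maps to 2; "groups", 6500x rarer, still gets a visible 0.15
round(size / max(size) * 2, 2)
```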
Code
textplot_wordcloud(text, min_count = 100, max_words = 200, random_order = TRUE)

Further study
Next I will add more reviews, continue the pre-processing, plot some exploratory visualizations, and, if possible, run a sentiment analysis.
Blog Post 3 · Mani Shanker Kamarapu · 11/5/2022